Tutorial step 7: adding MMX optimizations

The first step to optimizing our filter would be to use assembly code, but we'll spend some time on MMX optimizations first, because MMX can really speed this code up. The trick to using MMX is knowing when we can and can't use it; detection code isn't good enough because the user may have intentionally forced or disabled MMX support in VirtualDub's preferences dialog.

Checking to see if CPU optimizations are available

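Two functions in the FilterFunctions struct are useful here: isFPUEnabled() and isMMXEnabled(). Each returns true if the corresponding CPU optimization is both available and currently enabled by the user.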

FPU optimizations are rare in pixel processing; the only VirtualDub filter that uses them is the bilinear resize filter, because it requires 6 64-bit multiplies per pixel, which are more quickly done with the FPU than the integer units. MMX optimizations are the biggie; use them if at all possible.

The FPU and MMX enable flags are guaranteed not to change between filters or during processing. It is highly recommended that you globally cache the flags during startProc processing, and read the global flag in runProc.

Adding MMX optimizations to our filter

First, modify startProc to cache the MMX flag.

// Cached copy of the host's MMX-enabled flag. Written in tutorialStartProc
// and read in tutorialRunProc, so runProc does not have to query the host.
static bool g_MMXenabled;

// Excerpt of the filter's startProc: only the beginning of the function is
// shown here, up to the point where the MMX flag is cached.
int tutorialStartProc(FilterActivation *fa, const FilterFunctions *ff) {
    MyFilterData *mfd = (MyFilterData *)fa->filter_data;
    int i;

    // Ask the host whether MMX may be used and remember the answer globally.
    g_MMXenabled = ff->isMMXEnabled();

Now, write the MMX acceleration routine. For simplicity, it will only handle one scanline. I'm not usually very clear when writing MMX code, so don't feel bad if it takes a couple of passes to understand. ^^;;

// doscan_MMX: MMX implementation of the filter for a single scanline.
//
//   dst, src   destination and source scanlines
//   w          number of source pixels; must be at least 1, because both
//              loops test the counter only after processing a pixel
//   frac       16-bit fixed-point scale for the green and blue channels.
//              pmulhw is a signed multiply, so each pixel is doubled first
//              and frac is expected to be HALF the intended factor
//              (0x4000 => 1/2, 0x2AAA => 1/3)
//   bias       value added to the green channel after scaling
//   fDouble    nonzero: every result pixel is written twice, so dst
//              receives 2*w pixels; zero: dst receives w pixels
//
// Red is copied through unchanged; the alpha byte of the result is zero.
// The routine is naked, so arguments are fetched relative to ESP: the +16
// in each offset accounts for the four registers pushed in the prologue.
// It does not execute EMMS itself; the caller is responsible for that.
void __declspec(naked) doscan_MMX(Pixel32 *dst, Pixel32 *src, int w, long frac, long bias, int fDouble) {
    static const __int64 Rmask = 0x0000FFFF00000000i64;

    __asm {
        push        ebp
        push        edi
        push        esi
        push        ebx

        mov         eax,[esp+4+16]      ;eax = dst
        mov         edx,[esp+8+16]      ;edx = src

        mov         ecx,[esp+12+16]     ;ecx = w
        neg         ecx
        shl         ecx,2               ;ecx = -4*w (negative byte offset)
        sub         eax,ecx             ;eax = end of dst (1 pixel per source pixel)
        sub         edx,ecx             ;edx = end of src

        movq        mm6,Rmask           ;mm6 = mask selecting the red word
        movd        mm4,[esp+20+16]     ;mm4 = bias
        psllq       mm4,16              ;move bias into the green word
        movd        mm5,[esp+16+16]     ;mm5 = frac
        punpcklwd   mm5,mm5             ;frac in blue and green words, 0 in red/alpha
        pxor        mm7,mm7             ;mm7 = 0 for unpacking

        mov         ebx,dword ptr [esp+24+16]   ;ebx = fDouble
        or          ebx,ebx
        jz          xloop1

        sub         eax,ecx             ;eax = end of dst (2 pixels per source pixel)

xloop2:
        movd        mm0,[edx+ecx]       ;mm0 = pixel
        movq        mm1,mm6             ;mm1 = R mask
        punpcklbw   mm0,mm7             ;unpack pixel to words
        pand        mm1,mm0             ;mm1 = red component
        paddw       mm0,mm0             ;double g/b channels beforehand (same as xloop1)
        pmulhw      mm0,mm5             ;scale green and blue
        paddw       mm1,mm4             ;add green bias
        paddw       mm0,mm1             ;add scaled green/blue
        packuswb    mm0,mm0             ;repack pixel to bytes
        movq        [eax+ecx*2],mm0     ;write 2 pixels
        add         ecx,4
        jne         xloop2
        jmp         short xit

xloop1:
        movd        mm0,[edx+ecx]       ;mm0 = pixel
        movq        mm1,mm6             ;mm1 = R mask
        punpcklbw   mm0,mm7             ;unpack pixel to words
        pand        mm1,mm0             ;mm1 = red component
        paddw       mm0,mm0             ;double g/b channels beforehand
        pmulhw      mm0,mm5             ;scale green and blue
        paddw       mm1,mm4             ;add green bias
        paddw       mm0,mm1             ;add scaled green/blue
        packuswb    mm0,mm0             ;repack pixel to bytes
        movd        [eax+ecx],mm0       ;write pixel
        add         ecx,4
        jne         xloop1

xit:
        pop         ebx
        pop         esi
        pop         edi
        pop         ebp
        ret
    }
}

Finally, add the MMX optimizations to runProc.

int tutorialRunProc(const FilterActivation *fa, const FilterFunctions *ff) {
    MyFilterData *mfd = (MyFilterData *)fa->filter_data;
    PixDim w, h;
    Pixel32 *src, *dst;
    const Pixel32 *grn_tab = mfd->grn_tab;
    const Pixel32 *blu_tab = mfd->blu_tab;

    src = (Pixel32 *)fa->src.data;
    dst = (Pixel32 *)fa->dst.data;

    h = fa->src.h;
    do {
        w = fa->src.w;

        if (g_MMXenabled) {

            doscan_MMX(dst, src, w,
                    mfd->fThird ? 0x2AAA : 0x4000,
                    mfd->fThird ? 0x55 : 0x80,
                    mfd->fExpand);


            src = (Pixel32 *)((char *)src + fa->src.pitch);
            dst = (Pixel32 *)((char *)dst + fa->dst.pitch);

    // double the routine for speed; an if would kill us in the
    // inner loop, but in the outer loop it's ok

        } else {
		    if (mfd->fExpand)
                do {
                    Pixel32 old_pixel, new_pixel;

                    old_pixel = *src++;

                    new_pixel = (old_pixel & 0xFF0000)
                              + grn_tab[(old_pixel>>8) & 0xff]
                              + blu_tab[old_pixel & 0xff];

                    *dst++ = new_pixel;
                    *dst++ = new_pixel;
                } while(--w);
            else
                do {
                    Pixel32 old_pixel, new_pixel;

                    old_pixel = *src++;

                    new_pixel = (old_pixel & 0xFF0000)
                              + grn_tab[(old_pixel>>8) & 0xff]
                              + blu_tab[old_pixel & 0xff];

                    *dst++ = new_pixel;
                } while(--w);

            src = (Pixel32 *)((char *)src + fa->src.modulo);
            dst = (Pixel32 *)((char *)dst + fa->dst.modulo);
        }

    } while(--h);

    if (g_MMXenabled)
        __asm emms

    return 0;
}

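MMX optimizations basically just require the MMX code; you spend 99.99% of your time wringing out more speed. Two more important notes, both visible in the runProc code above:

* Execute the emms instruction once you are done with MMX and before returning; runProc does this a single time after the scanline loop rather than once per scanline.
* doscan_MMX does not advance the src and dst pointers, so the MMX path steps to the next scanline using pitch, while the C path, which has already walked across the row, uses modulo.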

On to the next, and final, chapter of this tutorial!

[up] back to main page
[prev] tutorial[6]: supporting user configuration
[next] tutorial[8]: adding job (batch) support


VirtualDub external filter SDK 1.05©1999-2001 Avery Lee <phaeron@virtualdub.org>